from numpy.core.numeric import full
import requests
from bs4 import BeautifulSoup
import pandas
import re
import time
import string
from bs4 import BeautifulSoup


# --- Configuration ---------------------------------------------------------

# CSV listing the articles to scrape; it must contain a "url" column.
input_file = "LINKS_ukraina_ru_vaccine.csv"

# CSV that receives one row per article that was scraped and kept.
output_file = "DATA_ukraina_ru_vaccine.csv"

# Optional keyword filters, matched with re.search against the cleaned,
# lower-cased article text. Not relevant to this scrape, so both are empty.
# An article is kept when it matches at least one of the "one of" terms
# (or that list is empty) AND every one of the "all" terms.
include_one_of_these_terms = []
include_all_terms = []

# Characters removed from titles and article text before they are stored.
# NOTE(review): the empty "" literal contributes nothing; it may be a
# character that was lost in an earlier edit (e.g. an opening quote) - confirm.
exclist = string.punctuation + "–" + "—" + "" + "„" + "”" + "«" + "»" + "…"
_PUNCTUATION_TABLE = str.maketrans("", "", exclist)

# requests never times out on its own, so a stalled server would otherwise
# hang the whole scrape.
REQUEST_TIMEOUT_SECONDS = 30

# Paragraph that introduces the "read more" list of other articles.
READ_MORE_MARKER = "Читайте также:"


# --- Helpers ---------------------------------------------------------------

def _clean(text):
    """Return *text* lower-cased with every character in ``exclist`` removed."""
    return text.translate(_PUNCTUATION_TABLE).lower()


def _passes_filters(full_text):
    """Return True when *full_text* satisfies both keyword filter lists.

    Terms are lower-cased and used as regular-expression patterns, exactly as
    written in the configuration lists above.
    """
    if include_one_of_these_terms and not any(
        re.search(term.lower(), full_text) for term in include_one_of_these_terms
    ):
        return False
    # all() over an empty list is True, i.e. no filtering.
    return all(re.search(term.lower(), full_text) for term in include_all_terms)


def _require(element, description):
    """Return *element*, or raise ValueError if the page did not contain it."""
    if element is None:
        raise ValueError("page has no " + description)
    return element


def _collect_text(soup):
    """Return the teaser followed by the body paragraphs as a list of strings.

    Raises:
        ValueError: if the teaser or article body section is missing.
    """
    teaser = _require(
        soup.find("section", {"itemprop": "description"}), "teaser section"
    )
    body = _require(soup.find("section", {"class": "article-body"}), "article body")

    list_text = [teaser.text.strip()]
    for para in body.find_all("p"):
        raw_text = para.text
        # Insert-article sections end up inside the <p> that find_all returns,
        # so their text has to be removed. This is done on the raw text,
        # BEFORE whitespace is normalised; otherwise the inject text (which
        # still has its original newlines / non-breaking spaces) would no
        # longer match and would leak into the article text.
        inject_article = para.find("section", {"class": "inject-article"})
        if inject_article is not None:
            raw_text = raw_text.replace(inject_article.text, "")
        text = raw_text.strip().replace("\xa0", " ").replace("\n", " ")
        # Everything after the "read more" marker is a list of other articles.
        # They are plain <p> tags inside article-body, so the marker is the
        # only way to tell them apart.
        if text == READ_MORE_MARKER:
            break
        list_text.append(text)
    return list_text


def _parse_article(url, soup):
    """Build the output row for one article.

    Returns:
        A dict with the keys "url", "title", "date" and "full_text", or None
        when the article does not pass the keyword filters.

    Raises:
        ValueError: if an element the scraper relies on is missing.
    """
    full_text = _clean(" ".join(_collect_text(soup)))
    if not _passes_filters(full_text):
        return None

    title = _require(soup.find("h1", {"itemprop": "name"}), "title heading")
    published = _require(soup.find("time"), "time element")
    return {
        "url": url,
        "title": _clean(title.text),
        # Keep only the part of the timestamp before the first comma.
        "date": published.text.split(",")[0],
        "full_text": full_text,
    }


# --- Scrape ----------------------------------------------------------------

df = pandas.read_csv(input_file)

url_list = df["url"].tolist()

print("This is the number of articles to scrape: " + str(len(url_list)))

l = []

# enumerate gives the running position directly; looking it up with
# url_list.index() was quadratic and wrong for URLs listed more than once.
for position, article in enumerate(url_list, start=1):
    print("This is how many we have scraped so far: " + str(position) + " out of " + str(len(url_list)))
    # It is unfortunate to add this as it makes the scrape take longer, but it
    # is necessary so that a very large dataset does not overload the server.
    time.sleep(0.5)

    # A single unreachable or malformed page must not discard the rows that
    # were already collected, so failures are reported and skipped.
    try:
        response = requests.get(article, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        print("Skipping " + str(article) + " - request failed: " + str(error))
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    try:
        d = _parse_article(article, soup)
    except ValueError as error:
        print("Skipping " + str(article) + " - " + str(error))
        continue

    # None means the article did not meet the keyword filters.
    if d is not None:
        l.append(d)


df = pandas.DataFrame(l)
df.to_csv(output_file, encoding="utf-8-sig")